gapminder <- read_csv("~/R/Projects/Datasets/gapminder_clean.csv", col_names = TRUE)
styling1 <- list(
geom_point(),
scale_x_log10(),
scale_y_log10(),
geom_smooth(method = "lm", se = FALSE),
theme_classic(),
labs(
caption = "Source from Gapminder"
)
)
gapminder_1962 <- gapminder %>%
filter(Year == 1962)
ggplot(gapminder_1962, aes(gdpPercap, `CO2 emissions (metric tons per capita)`)) +
styling1 +
labs(title = "CO2 emissions by GDP per capita in 1962",
x = "GDP per capita"
)
The Pearson correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap in the year 1962 is 0.926 (p = <2.2e-16).
cor.test(gapminder_1962$gdpPercap, gapminder_1962$`CO2 emissions (metric tons per capita)`, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: gapminder_1962$gdpPercap and gapminder_1962$`CO2 emissions (metric tons per capita)`
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
'CO2 emissions (metric tons per capita)' and gdpPercap the strongest?A: 1967 contains the strongest correlation between 'CO2 emissions (metric tons per capita)' and gdpPercap.
# In what year is the correlation between 'CO2 emissions (metric tons per capita)' and gdpPercap the strongest?
corr_gapminder <- gapminder %>%
group_by(Year) %>%
select(gdpPercap, `CO2 emissions (metric tons per capita)`, Year) %>%
summarize(corr = cor(gdpPercap, `CO2 emissions (metric tons per capita)`, use = "complete.obs")) %>%
arrange(desc(corr))
corr_gapminder
## # A tibble: 10 x 2
## Year corr
## <dbl> <dbl>
## 1 1967 0.939
## 2 1962 0.926
## 3 1972 0.843
## 4 1982 0.817
## 5 1987 0.810
## 6 1992 0.809
## 7 1997 0.808
## 8 2002 0.801
## 9 1977 0.793
## 10 2007 0.720
gapminder_1967 <- gapminder %>%
filter(Year == 1967)
# interactive scatter plot
ggplotly(ggplot(gapminder_1967, aes(gdpPercap, `CO2 emissions (metric tons per capita)`, size = pop, color = continent)) +
styling1 +
labs(title = "CO2 emissions by GDP per capita in 1967",
x = "GDP per capita"
)
)
continent and 'Energy use (kg of oil equivalent per capita)'?A: There is a statistically significant difference [F(4, 516) = 58.316, p = < 2.2e-16] in Energy use (kg of oil equivalent per capita) between continents as determined by one-way ANOVA. Because R-squared is small (~0.3), there is a weak linear relation between the treatment levels and response.
complete_rec <- na.exclude(gapminder)
LM <- lm(`Energy use (kg of oil equivalent per capita)` ~ continent, data = complete_rec)
anova(LM)
## Analysis of Variance Table
##
## Response: Energy use (kg of oil equivalent per capita)
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 4 679754869 169938717 58.316 < 2.2e-16 ***
## Residuals 516 1503687009 2914122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(LM)
##
## Call:
## lm(formula = `Energy use (kg of oil equivalent per capita)` ~
## continent, data = complete_rec)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3132.8 -829.8 -296.8 164.9 13249.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 664.9 147.5 4.509 8.08e-06 ***
## continentAmericas 693.7 214.5 3.233 0.0013 **
## continentAsia 855.7 211.4 4.047 5.97e-05 ***
## continentEurope 2852.5 211.8 13.466 < 2e-16 ***
## continentOceania 3841.4 479.5 8.012 7.59e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1707 on 516 degrees of freedom
## Multiple R-squared: 0.3113, Adjusted R-squared: 0.306
## F-statistic: 58.32 on 4 and 516 DF, p-value: < 2.2e-16
#resid_data <- resid(LM)
#qqnorm(resid_data, pch = 1, frame = FALSE)
#qqline(resid_data)
# boxplot
ggplot(complete_rec, aes(continent, `Energy use (kg of oil equivalent per capita)`)) +
geom_boxplot(color = "red", fill = "orange", alpha = 0.2) +
theme_classic() +
labs(
x = "Continent",
caption = "Source from Gapminder"
)
# summary stats
complete_rec %>%
group_by(continent) %>%
get_summary_stats(`Energy use (kg of oil equivalent per capita)`, type = "mean_sd")
## # A tibble: 5 x 5
## continent variable n mean sd
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Africa Energy use (kg of oil equivalent per capita) 134 665. 597.
## 2 Americas Energy use (kg of oil equivalent per capita) 120 1359. 2041.
## 3 Asia Energy use (kg of oil equivalent per capita) 127 1521. 1952.
## 4 Europe Energy use (kg of oil equivalent per capita) 126 3517. 1943.
## 5 Oceania Energy use (kg of oil equivalent per capita) 14 4506. 816.
# outliers. may try to take out
complete_rec %>%
group_by(continent) %>%
identify_outliers(`Energy use (kg of oil equivalent per capita)`) %>%
select(continent, is.outlier, is.extreme)
## # A tibble: 42 x 3
## continent is.outlier is.extreme
## <chr> <lgl> <lgl>
## 1 Africa TRUE FALSE
## 2 Africa TRUE TRUE
## 3 Africa TRUE TRUE
## 4 Africa TRUE TRUE
## 5 Africa TRUE TRUE
## 6 Africa TRUE TRUE
## 7 Africa TRUE TRUE
## 8 Africa TRUE TRUE
## 9 Africa TRUE TRUE
## 10 Africa TRUE TRUE
## # ... with 32 more rows
# check for normality; qqplot and shapiro
ggqqplot(residuals(LM))
shapiro_test(residuals(LM))
## # A tibble: 1 x 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 residuals(LM) 0.710 5.04e-29
# check for normality, but for each group
ggqqplot(complete_rec, "Energy use (kg of oil equivalent per capita)", facet.by = "continent")
# homogenity of variance
plot(LM, 1)
complete_rec %>% levene_test(`Energy use (kg of oil equivalent per capita)` ~ continent)
## # A tibble: 1 x 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 4 516 8.66 0.000000905
#complete_rec$`Energy use (kg of oil equivalent per capita)` <- log10(complete_rec$`Energy use (kg of oil equivalent per capita)`)
#ggdensity(resid_data, x = "Energy use (kg of oil equivalent per capita)", color = "red", fill = "orange", alpha = 0.2) +
# scale_x_continuous() +
# stat_overlay_normal_density(color = "red", linetype = "dashed")
#shapiro.test(complete_rec$`Energy use (kg of oil equivalent per capita)`)
'Imports of goods and services (% of GDP)' in the years after 1990?A: The mean Imports of goods and services (% of GDP) in Asia and Europe is 46.5 (SD = 37.5) and 41.6 (SD = 16.2), respectively. A Welch two-sample t-test demonstrates there is no statistically significant difference [t(91.9) = 1.04, p = 0.3, d = 0.169], meaning the means of the two populations are the same.
Note: To meet the normality assumption not met in the original data, a sqrt transformation was attempted. While the transformation improved the distribution of the data to normality, it provided no meaningful differences as it led to the same conclusion. Therefore, the analysis was done on the original data.
gapminder_post1990 <- complete_rec %>%
filter(Year > 1990, continent %in% c("Europe", "Asia")) %>%
select(Year, continent, `Imports of goods and services (% of GDP)`)
# qqplot
ggqqplot(gapminder_post1990, x = "Imports of goods and services (% of GDP)", facet.by = "continent")
# welch t test
t.test <- gapminder_post1990 %>%
t_test(`Imports of goods and services (% of GDP)` ~ continent) %>%
add_significance()
t.test
## # A tibble: 1 x 9
## .y. group1 group2 n1 n2 statistic df p p.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Imports of goods and~ Asia Europe 73 99 1.04 91.9 0.3 ns
# cohen's d
gapminder_post1990 %>% cohens_d(`Imports of goods and services (% of GDP)` ~ continent, var.equal = FALSE)
## # A tibble: 1 x 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 Imports of goods and services (% ~ Asia Europe 0.169 73 99 negligib~
# summary stats
gapminder_post1990 %>%
group_by(continent) %>%
get_summary_stats(`Imports of goods and services (% of GDP)`, type = "mean_sd")
## # A tibble: 2 x 5
## continent variable n mean sd
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Asia Imports of goods and services (% of GDP) 73 46.5 37.5
## 2 Europe Imports of goods and services (% of GDP) 99 41.6 16.2
'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)A: The country with the highest average ranking in 'Population density (people per sq. km of land area)' across all years is Macao SAR, China, followed by Monaco.
gapminder_density_overall <- gapminder %>%
group_by(`Country Name`) %>%
summarize(mean_pop_density = mean(`Population density (people per sq. km of land area)`)) %>%
top_n(5, mean_pop_density)
gapminder_density_overall
## # A tibble: 5 x 2
## `Country Name` mean_pop_density
## <chr> <dbl>
## 1 Gibraltar 2622.
## 2 Hong Kong SAR, China 5153.
## 3 Macao SAR, China 14732.
## 4 Monaco 14090.
## 5 Singapore 4361.
ggplot(gapminder_density_overall, aes(x = reorder(`Country Name`,
-mean_pop_density),
y = mean_pop_density)) +
geom_bar(color = "red", fill = "orange", alpha = 0.2, width = 0.7, stat = "identity") +
theme_classic() +
labs(
title = "Countries with the Highest Average Population Density",
x = "Country Name",
y = "Average Population Density",
caption = "Source from Gapminder"
)
'Life expectancy at birth, total (years)' since 1962?A: Cambodia has had the greatest increase (2.44) in 'Life expectancy at birth, total (years)'.
gapminder_life_expect <- gapminder %>%
filter(Year > 1962) %>%
arrange(Year) %>%
group_by(`Country Name`) %>%
mutate(diff_yr = Year - lag(Year),
diff_life_expect = `Life expectancy at birth, total (years)` - lag(`Life expectancy at birth, total (years)`),
rate_per_life = (diff_life_expect / diff_yr)/lag(`Life expectancy at birth, total (years)`) * 100) %>%
summarize(avg_rate_life_expect = mean(rate_per_life, na.rm = TRUE)) %>%
top_n(5, avg_rate_life_expect)
gapminder_life_expect
## # A tibble: 5 x 2
## `Country Name` avg_rate_life_expect
## <chr> <dbl>
## 1 Bhutan 1.66
## 2 Cambodia 2.44
## 3 Maldives 1.53
## 4 Mali 1.50
## 5 Timor-Leste 1.55
ggplot(gapminder_life_expect, aes(x = reorder(`Country Name`,
-avg_rate_life_expect),
y = avg_rate_life_expect)) +
geom_bar(color = "red", fill = "orange", alpha = 0.2, width = 0.7, stat = "identity") +
theme_classic() +
labs(
title = "Countries with the Greatest Increase in Life Expectancy since 1962",
x = "Country Name",
y = "Average Life Expectancy Percent Rate",
caption = "Source from Gapminder"
)